# R script for the Homework 5 of 19-704
# Replication and extension of the results 
# from Goldstein et al. (1993)

setwd("C:/Users/Alexandra/Documents/R")

install.packages("mlmRev", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(mlmRev)
data("Exam", package = "mlmRev")
exam <- data.frame(Exam)
head(exam)
summary(exam)

# Question 1 ---------------------------------------------------------
exam <- transform(exam, sch = as.numeric(school))
head(exam)
summary(exam$sch)
png(filename = "HW5_Cosseron_histogram1_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
hist(exam$sch,
     breaks = c(1: 65),
     xlim = c(1, 65),
     ylim = c(0, 200),
     xlab = "Number of students in each school",
     main = "")
dev.off()

summary(exam$normexam)
head(exam)
png(filename = "HW5_Cosseron_histogram2_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
hist(exam$normexam,
     breaks = "fd",
     xlim = c(-4, 4),
     ylim = c(0, 500),
     xlab = "Normalized scores on the GCSE exams",
     main = "")
dev.off()

summary(exam$intake)
exam <- transform(exam, int = as.numeric(intake))
head(exam)
summary(exam$int)
xlabel <- expression(atop("Average London Reading Test intake scores", "1 = bottom 25%; 2 = mid 50%; 3 = top 25%"))
png(filename = "HW5_Cosseron_histogram3_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
hist(exam$int,
     breaks = "fd",
     xlim = c(1, 3),
     ylim = c(0, 2500),
     xlab = xlabel,
     main = "")
dev.off()

summary(exam$standLRT)
png(filename = "HW5_Cosseron_histogram4_Q1.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3)
hist(exam$standLRT,
     breaks = "fd",
     xlim = c(-3, 4),
     ylim = c(0, 500),
     xlab = "standardized London Reading Test intake scores",
     main = "")
dev.off()

summary(exam$schgend)
summary(exam$vr)
summary(exam$intake)
summary(exam$sex)
summary(exam$type)

# Question 2 --------------------------------------------------

# Completely Pooled Intercept -----
completepool1 <- lm(normexam ~ 1, data = exam)
summary(completepool1)

# No Pooled Intercept -------------

# Make an empty data frame to store
# the regression results for each school
no.pool <- data.frame(int         = rep(NA, 65),
                      std.error   = rep(NA, 65),
                      school      = unique(exam$school),
                      sample.size = rep(NA, 65))

for(i in unique(exam$school)) {
  # Conduct the regression for data only in school i
  gcsereg.np <- lm(normexam ~ 1, data = exam[exam$school == i, ])
  # Pull out the no pooling intercept
  no.pool$int[no.pool$school == i] <- coef(gcsereg.np)
  # Pull out the no pooling standard error for the intercept
  no.pool$std.error[no.pool$school == i] <- coef(summary(gcsereg.np))[1, 2]
  # Pull out the sample size for that school
  no.pool$sample.size[no.pool$school == i] <- nrow(exam[exam$school == i, ])
}

no.pool

# Partially Pooled Intercept ---------
install.packages("arm", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(arm)
# (1|school) indicates that we should estimate partially pooled
# school-level intercepts
gcsereg.pp <- lmer(normexam ~ 1 + (1|school), data = exam)
summary(gcsereg.pp)

# Rough overview of the distribution of the no pooling
# and partial pooling intercepts
max(ranef(gcsereg.pp)$school)
min(ranef(gcsereg.pp)$school)

mean(no.pool$int)
max(no.pool$int)
min(no.pool$int)

png(filename = "HW5_Cosseron_distrib_Q2.png",
    width = 3000,
    height = 3000,
    res = 300)
par(cex = 1.3,
    mfrow = c(2, 1))
plot(no.pool$int,
     xlab = "No Pooled Intercept")
plot(as.matrix(ranef(gcsereg.pp)$school),
     xlab = "Partially Pooled Intercept")
dev.off()

# Example for one school -----
as.matrix(ranef(gcsereg.pp)$school)[1]
no.pool$int[1]
